from allennlp.data.dataset_readers import DatasetReader as AllenDatasetReader
from allennlp.data.fields import TextField, LabelField
from allennlp.data import Instance, Token

from config import Config


class DatasetReader(AllenDatasetReader):
    """Config-driven reader that turns raw sentences into AllenNLP instances.

    The tokenizer, token indexers, maximum length and optional filter are all
    taken from the token configuration returned by ``cf.token_config``.
    """

    def __init__(self, cf: Config, token_type=None, lazy=False):
        """Set up the reader from the project configuration.

        Args:
            cf: Project configuration. Must provide ``token``, ``dataset`` and
                ``token_config(token_type)``.
            token_type: Which token configuration to load; falls back to
                ``cf.token`` when ``None``.
            lazy: Forwarded unchanged to the AllenNLP base reader.
        """
        super().__init__(lazy=lazy)
        self.cf = cf
        if token_type is None:
            token_type = cf.token
        self.cf_token = cf.token_config(token_type)

        # Maximum number of tokens per sentence for the configured dataset;
        # a value <= 0 disables truncation (see text_to_instance).
        self.max_len = self.cf_token['max_len'][self.cf.dataset]

        self.tokenizer = self.cf_token['tokenizer']
        # An indexer's namespace corresponds to a Vocabulary namespace;
        # the 'tokens' key of token_indexers corresponds to the Embedding
        # namespace.
        self.token_indexers = self.cf_token['token_indexers']

        # The filter is optional; it is only consulted for over-long sentences.
        self.filter = self.cf_token['filter'] if 'filter' in self.cf_token else None

    def text_to_instance(self, sentence, label: str = None) -> Instance:
        """Build an ``Instance`` from a raw sentence and an optional label.

        Args:
            sentence: Raw sentence text.
            label: Label string. A falsy label (``None`` or ``''``) produces an
                instance without a ``'label'`` field.

        Returns:
            An ``Instance`` with a ``'sentence'`` ``TextField`` and, when a
            label was given, a ``'label'`` ``LabelField``.
        """
        # Map the literal '@@UNKNOWN@@' marker to the tokenizer's own unknown
        # token before tokenizing.
        sentence = sentence.replace('@@UNKNOWN@@', self.tokenizer.unk)
        tokens = self.tokenizer.tokenize(sentence)

        if self.max_len > 0 and len(tokens) > self.max_len:
            # Give the filter a chance to shorten the sentence first, then
            # hard-truncate whatever is still over the limit.
            if self.filter is not None:
                sentence = self.filter.filter(sentence)
                tokens = self.tokenizer.tokenize(sentence)
            tokens = tokens[:self.max_len]

        fields = {'sentence': TextField(tokens, self.token_indexers)}
        if label:
            fields['label'] = LabelField(label)
        return Instance(fields)

    def text_to_json(self, sentence, label: str = None):
        """Return the sentence and label as a plain JSON-style dict."""
        return {'sentence': sentence, 'label': label}

    def instance_to_text(self, instance):
        """Convert an ``Instance`` back into a sentence/label dict.

        Args:
            instance: An instance carrying a ``'sentence'`` field and,
                optionally, a ``'label'`` field.

        Returns:
            A dict with the detokenized ``'sentence'`` and the ``'label'``;
            the label is ``None`` when the instance has no ``'label'`` field.
        """
        sentence = self.tokenizer.detokenize(
            [token.text for token in instance['sentence']]
        )
        # text_to_instance omits the 'label' field for unlabeled input, so
        # look it up defensively instead of indexing (which raised KeyError).
        label_field = instance.fields.get('label')
        label = label_field.label if label_field is not None else None
        return {'sentence': sentence, 'label': label}